import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the raw phone-spec table.
df = pd.read_csv('phonespecs.csv')
print(len(df), "phones read")
print()
# Coerce the eight spec columns (columns 2..9) to numbers; unparsable cells become NaN.
numeric_cols = df.columns[2:10]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
# A row is usable only when every numeric column parsed successfully.
valid_number_rows = df[numeric_cols].notna().all(axis=1)
#############################################################################################
# Keep only phones from brands with sufficiently many phones and also released phone recently
counts = df.groupby(['Brand']).Brand.count()  # Count number of phones for each brand
major_brand_by_counts = counts[counts >= counts['Google']]  # Brands with #phones as many as Pixel
mostrecent = df.groupby(['Brand'], sort=False).Released.max()  # Find the most recent release of brand
major_brand_by_recent = mostrecent[mostrecent >= 2021]  # Keep only brands that released after 2020
major_brand_rows = df.Brand.isin(major_brand_by_counts.index) \
    & df.Brand.isin(major_brand_by_recent.index)
# FIX: Series.iteritems() was removed in pandas 2.0; Series.items() is the
# supported equivalent and behaves identically here.
print("Keeping only phones from:", ', '.join(["{} ({})".format(b, c) for b, c in
      counts[major_brand_by_counts.index.intersection(major_brand_by_recent.index)].items()]))
print()
# Materialise the surviving rows as NumPy arrays used by the rest of the script.
rows = (valid_number_rows & major_brand_rows)
data = (df[numeric_cols])[rows].to_numpy()
labels = np.array(df['Model'].to_numpy()[rows])
brands = np.array(df['Brand'].to_numpy()[rows])
print("Total", len(data), "rows of", list(numeric_cols), "phone specs collected")
# Output: 16614 phones read Keeping only phones from: Alcatel (395), Apple (402), Archos (69), Asus (363), Coolpad (154), Google (68), Huawei (1153), Kyocera (83), LG (1243), Lenovo (257), Meizu (217), Motorola (762), Nokia (470), OnePlus (138), Oppo (849), Samsung (2389), Sharp (136), Sony (363), T-Mobile (98), TCL (134), Vivo (666), Wiko (91), Xiaomi (759), ZTE (648) Total 11239 rows of ['Released', 'Width', 'Height', 'Depth', 'Weight', 'Display', 'Ratio', 'Battery'] phone specs collected
# Locate the original iPhone ("iPhone 1st") and drop every phone released before it.
# FIX: np.core.defchararray is private API and was removed in NumPy 2.0;
# np.char.find is the public, equivalent string-search routine.
index = np.nonzero(np.char.find(labels.astype('str'), "iPhone 1st") != -1)[0]
rows = np.where(data[:, 0] >= data[index, 0])[0]
data = data[rows]
labels = labels[rows]
brands = brands[rows]
print(len(data), "rows remain after removing phones before iPhone 1st")
# Output: 11130 rows remain after removing phones before iPhone 1st
import editdistance
def has_similar_names(i, j):
    """Return True when rows i and j are the same brand and their model names
    differ by less than half the shorter name's length in edit distance.

    Reads the module-level `brands` and `labels` arrays.
    """
    if brands[i] != brands[j]:
        return False
    shorter = min(len(labels[i]), len(labels[j]))
    return editdistance.eval(labels[i], labels[j]) < shorter // 2
# Collapse near-duplicate entries (regional/storage variants): after sorting
# by model name, any later row with identical specs and a similar name within
# a 1000-row window is flagged and dropped.
removed = np.full(len(data), False)
order = labels.argsort()
data, labels, brands = data[order], labels[order], brands[order]
n = len(data)
for i in range(n):
    if removed[i]:
        continue
    # Only compare against nearby rows; name-sorted variants sit close together.
    for j in range(i + 1, min(i + 1000, n)):
        if not removed[j] and (data[i] == data[j]).all() and has_similar_names(i, j):
            removed[j] = True
keep = ~removed
data = data[keep]
labels = labels[keep]
brands = brands[keep]
print(len(data), "rows remain after removing variants of the same phone")
# Output: 5519 rows remain after removing variants of the same phone
# Keep only phones whose screen ratio is at least 1 (taller than wide,
# per the print message below — assumes Ratio = height/width).
cutoff_ratio = 1.0
keep_ratio = data[:, 6] >= cutoff_ratio
data = data[keep_ratio]
labels = labels[keep_ratio]
brands = brands[keep_ratio]
print(len(data), "rows remain after removing phones with larger width than height")
# Output: 5407 rows remain after removing phones with larger width than height
# Phones of overly large screen size (>=7.5in)
# Print every phone with a display of 7.5 inches or more.
large_scr_idx = np.where(data[:, 5] >= 7.5)[0]
for b, m, s, y in zip(brands[large_scr_idx], labels[large_scr_idx],
                      data[large_scr_idx, 5], data[large_scr_idx, 0]):
    print("{:7} {:70} {:5.2f} in {:5}".format(b, m, s, int(y)))
# Output: Xiaomi Mi Mix Alpha 5G Dual SIM TD-LTE CN 512GB 7.92 in 2020 Xiaomi Mi Mix Fold 2021 5G Ceramic Special Edition Dual SIM TD-LTE CN 512GB 8.01 in 2021 Xiaomi Mi Mix Fold 2021 5G Premium Edition Dual SIM TD-LTE CN 256GB 8.01 in 2021
# Phones of screen size between 6.9in and 7.5in
# Print phones with a 6.9-7.5 inch display, sorted by weight.
mid_scr_idx = np.where((data[:, 5] >= 6.9) & (data[:, 5] < 7.5))[0]
# Stacking object/float arrays keeps per-element types, so the spec values
# below remain floats rather than strings.
rows_to_show = np.stack((brands[mid_scr_idx], labels[mid_scr_idx],
                         data[mid_scr_idx, 5], data[mid_scr_idx, 4],
                         data[mid_scr_idx, 0])).transpose()
for b, m, s, w, y in sorted(rows_to_show, key=lambda r: r[3]):
    print("{:10} {:70} {:5.2f}in {:4}g {}".format(b, m, s, int(w), int(y)))
# Output: ZTE Axon 30 5G Premium Edition Dual SIM TD-LTE CN 128GB A2322 6.92in 189g 2021 Huawei P50 Pocket 4G Premium Art Edition Dual SIM TD-LTE CN 512GB BAL-AL00 6.90in 190g 2021 Huawei P50 Pocket 4G Premium Edition Global Dual SIM TD-LTE 512GB BAL-L49 6.90in 190g 2022 ZTE Axon 20 4G Global Dual SIM TD-LTE 128GB A2121E 6.92in 198g 2020 ZTE Axon 20 5G Global Dual SIM TD-LTE 128GB A2121G 6.92in 198g 2021 ZTE Axon 20 5G Standard Edition Dual SIM TD-LTE CN 128GB A2121 / A20 6.92in 198g 2020 Huawei Enjoy 9 Max Dual SIM TD-LTE CN ARS-AL00 128GB / Changxiang Max 7.12in 210g 2018 Huawei Honor 8X Max 4G+ Standard Edition Dual SIM TD-LTE CN 128GB ARE-TL00 7.12in 210g 2018 Huawei Honor 8X Max Premium Edition Dual SIM TD-LTE CN 64GB ARE-AL10 7.12in 210g 2018 Huawei Y Max Dual SIM TD-LTE APAC ARS-LX2 / Honor 8X Max ARS-L22 7.12in 210g 2018 Xiaomi Mi Max 3 Dual SIM TD-LTE 128GB M1804E4A / M1804E4C 6.90in 221g 2018 ZTE Rakuten BIG 5G TD-LTE JP 6.92in 227g 2020 Huawei Honor Note 10 Premium Edition Dual SIM TD-LTE CN RVL-AL09 128GB 6.95in 230g 2018 Huawei Mate 20 X Dual SIM TD-LTE CN 128GB EVR-AL00 7.17in 232g 2018 Huawei Mate 20 X Global Dual SIM TD-LTE 128GB EVR-L29 7.17in 232g 2019 Xiaomi Black Shark 3 Pro 5G Premium Edition Dual SIM TD-LTE CN 256GB MBU-A0 7.09in 253g 2020 Lenovo Legion Phone 2 Pro 5G Premium Edition Dual SIM TD-LTE CN 256GB L70081 6.92in 262g 2021
# Treat as tablet/foldable: display >= 7.5in, or 6.9-7.5in and heavier than 210g.
is_tablet = (data[:, 5] >= 7.5) | ((data[:, 5] >= 6.9) & (data[:, 5] < 7.5) & (data[:, 4] > 210))
keep = ~is_tablet
data = data[keep]
labels = labels[keep]
brands = brands[keep]
print(len(data), "rows remain after removing tablets and foldables")
# Output: 5397 rows remain after removing tablets and foldables
# Per-year aggregates: phone count, plus mean/std of weight, display size and ratio.
years = list(range(int(data[:, 0].min()), int(data[:, 0].max()) + 1))

def _yearly(col):
    """Return (means, stds) of spec column `col` grouped by release year."""
    per_year = [data[data[:, 0] == y, col] for y in years]
    return [v.mean() for v in per_year], [v.std() for v in per_year]

num_phones = [int((data[:, 0] == y).sum()) for y in years]
avg_weight, avg_weight_error = _yearly(4)
avg_scr_size, avg_scr_size_error = _yearly(5)
avg_scr_ratio, avg_scr_ratio_error = _yearly(6)

# Four-panel overview of how the fleet evolved year over year.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(17, 8))
plt.subplots_adjust(hspace=0.35)
ax1.bar(years, num_phones)
ax1.set_title('Number of Phones Released Each Year')
ax2.errorbar(years, avg_weight, yerr=avg_weight_error, capsize=7)
ax2.set_ylim([0, 230])
ax2.set_title('Average Weight (g)')
ax3.errorbar(years, avg_scr_size, yerr=avg_scr_size_error, capsize=7)
ax3.set_ylim([0, 8])
ax3.set_title('Average Screen Diagonal Length (inches)')
ax4.errorbar(years, avg_scr_ratio, yerr=avg_scr_ratio_error, capsize=7)
ax4.set_title('Average Screen Ratio')
plt.show()
# Display-size distributions: overall on top, stacked per release year below.
fig, (ax1, ax2) = plt.subplots(2, figsize=(17, 11))
plt.subplots_adjust(hspace=0)
disp_size_by_year = [data[data[:, 0] == y, 5] for y in years]
ax1.hist(data[np.isin(data[:, 0], years), 5], bins=100)
ax2.hist(disp_size_by_year, bins=100, histtype='bar', stacked=True)
ax2.set_xlabel('Display Size (inch)')
ax2.legend(years)
ax1.set_ylabel('Distribution')
ax2.set_ylabel('Distribution')

# Screen-ratio distribution per year, drawn as a horizontal stacked histogram.
plt.figure(figsize=(16, 5))
scr_ratio_by_year = [data[data[:, 0] == y, 6] for y in years]
plt.hist(scr_ratio_by_year, histtype='bar', stacked=True, orientation="horizontal")
plt.ylabel('Screen Ratio')
plt.xlabel('Distribution')
plt.legend(years, loc=(1.01, 0.1))
plt.show()
# Set threshold for establishing an edge between two models:
# Released(y), Width(mm), Height(mm), Depth(mm), Weight(g), Display(mm), Ratio, Battery
thresholds = [None, 1, 1, 1, None, 0.5, 0.1, None]
# Two phones are connected unless some thresholded feature differs by more
# than its threshold. FIX: the original materialised every (i, j) pair in a
# Python list and looped per feature (O(n^2 * 8) interpreter work, ~15M
# tuples), and compared with `thresholds[c] == None` instead of `is None`.
# NumPy broadcasting computes the identical matrix; |x_i - x_j| is symmetric,
# so both adj[i, j] and adj[j, i] are zeroed exactly as before.
adj = np.ones((len(data), len(data)))
for c, threshold in enumerate(thresholds):
    if threshold is None:
        continue  # this feature does not constrain edges
    col = data[:, c]
    adj[np.abs(col[:, None] - col[None, :]) > threshold] = 0
np.fill_diagonal(adj, 0)  # no self-loops
plt.figure(figsize=(18, 18))
plt.imshow(adj)
plt.show()
import palsgraph
G = palsgraph.make_graph(adj, labels=list(labels), show_singletons=False)
import networkx as nx
comp = nx.algorithms.community.centrality.girvan_newman(G)
from itertools import islice
from matplotlib import patheffects

# Map each model label to its display size (string, 3 significant digits).
label2scrsize = {lbl: "{:.3}".format(data[idx, 5]) for idx, lbl in enumerate(labels)}

# Take the first (coarsest) split produced by the Girvan-Newman generator.
communities = next(iter(comp))
# Find optimal positions for displaying the communities in the graph
pos = palsgraph.getpos(G, communities)
# Generate a colormap
color_map = palsgraph.gen_colormap(G, communities)
# Draw graph
plt.figure(figsize=(17, 15))
nx.draw(G, pos=pos, node_color=color_map, edge_color='grey', with_labels=False)
# Label each community, at its centroid position, with its index.
for comm_index, comm in enumerate(communities):
    centre = np.array([pos[node] for node in comm]).mean(axis=0)
    txt = plt.text(centre[0], centre[1], comm_index, fontsize=14)
    txt.set_path_effects([patheffects.withStroke(linewidth=3, foreground='w')])
from matplotlib.patches import Patch
### Codes for generating legends
def find_matching_row(color_map, color):
    """Return the index of the first row of `color_map` equal to `color`.

    Falls through (returning None) when no row matches.
    """
    for idx, row in enumerate(color_map):
        if (row == color).all():
            return idx
def build_legends(G, labels, color_map, legend_names):
    """Build one legend Patch per distinct node colour.

    Each patch is labelled with the `legend_names` entry of one node drawn in
    that colour; patches are returned sorted alphabetically by label.
    """
    distinct_colors = np.unique(np.array(color_map), axis=0)
    entries = []
    for color in distinct_colors:
        # Representative node: the first graph node rendered in this colour.
        node = list(G.nodes)[find_matching_row(color_map, color)]
        entries.append((color, legend_names[np.where(labels == node)[0]][0]))
    entries.sort(key=lambda entry: entry[1])
    return [Patch(color=c, label=name) for c, name in entries]
### Plot graph colored by brand
# One colour group per brand.
color_groups = [labels[brands == brand] for brand in np.unique(brands)]
color_map = palsgraph.gen_colormap(G, color_groups)
# Draw graph
plt.figure(figsize=(17, 15))
nx.draw(G, pos=pos, node_color=color_map, edge_color='grey', with_labels=False)
plt.legend(handles=build_legends(G, labels, color_map, brands), fontsize=20, ncol=6, loc='upper center')
# Annotate each community, at its centroid, with its average display size.
for comm in communities:
    mean_size = "{:.2f}".format(np.mean([float(label2scrsize[node]) for node in comm]))
    centre = np.array([pos[node] for node in comm]).mean(axis=0)
    txt = plt.text(centre[0], centre[1], mean_size, fontsize=14)
    txt.set_path_effects([patheffects.withStroke(linewidth=3, foreground='w')])
### Plot graph colored by year
# One colour group per release year.
color_groups = [labels[data[:, 0] == year] for year in np.unique(data[:, 0])]
color_map = palsgraph.gen_colormap(G, color_groups)
# Draw graph
plt.figure(figsize=(17, 15))
nx.draw(G, pos=pos, node_color=color_map, edge_color='grey', with_labels=False)
years = data[:, 0]  # NOTE(review): rebinds `years` (previously the year list) — confirm intentional
plt.legend(handles=build_legends(G, labels, color_map, data[:, 0].astype('int')),
           fontsize=20, ncol=5, loc='upper center')
# Annotate each community, at its centroid, with its average display size.
for comm in communities:
    mean_size = "{:.2f}".format(np.mean([float(label2scrsize[node]) for node in comm]))
    centre = np.array([pos[node] for node in comm]).mean(axis=0)
    txt = plt.text(centre[0], centre[1], mean_size, fontsize=14)
    txt.set_path_effects([patheffects.withStroke(linewidth=3, foreground='w')])
import matplotlib.image as mpimg
import random
import os
# Map model label -> image filename, loaded from a two-column CSV.
_label_img_pairs = np.genfromtxt('./label2img.csv', delimiter=',', dtype='str')
label2img = {label: img for label, img in _label_img_pairs}
num_phones_shown = 7
min_comm_size = 20
# Pick 5 random communities that are large enough to sample phones from.
choice_communities = random.sample(
    [comm for comm in communities if len(comm) >= min_comm_size], 5)
# Map model label -> row index into data/brands.
lbll2idx = {lbl: idx for idx, lbl in enumerate(labels)}
# For each chosen community, print a text banner of average specs and show a
# row of sampled phone images annotated with brand and release year.
for comm in choice_communities:
    comm_size = len(comm)
    # FIX: girvan_newman yields communities as sets, and random.sample()
    # requires a sequence since Python 3.11 — materialise the set first.
    comm = random.sample(list(comm), num_phones_shown)
    avg_disp_size = "{:.2f}".format(np.mean([float(data[lbll2idx[c], 5]) for c in comm]))
    avg_width = "{:.2f}".format(np.mean([float(data[lbll2idx[c], 1]) for c in comm]))
    avg_height = "{:.2f}".format(np.mean([float(data[lbll2idx[c], 2]) for c in comm]))
    avg_depth = "{:.2f}".format(np.mean([float(data[lbll2idx[c], 3]) for c in comm]))
    comm_desc = "Community size: {}, Averages: display {}, height {}, width {}, depth {}".format( \
        comm_size, avg_disp_size, avg_height, avg_width, avg_depth)
    # A thin, axis-less figure acts as a text banner above the image row.
    fig, ax = plt.subplots(1, 1, figsize=(15, 0.05))
    ax.axis('off')
    ax.text(0, 0, comm_desc, bbox=None)
    fig, axs = plt.subplots(1, num_phones_shown, figsize=(18, 3),
                            gridspec_kw={'width_ratios': [1] * num_phones_shown})
    for ax, c in zip(axs, comm):
        img = mpimg.imread(os.path.join("images", label2img[c]))
        ax.axis('off')
        # Overlay brand (centre) and release year (just below) on the image.
        txt = ax.text(img.shape[1] / 2, img.shape[0] / 2, brands[lbll2idx[c]],
                      ha='center', bbox=None)
        txt.set_path_effects([patheffects.withStroke(linewidth=5, foreground='w')])
        txt = ax.text(img.shape[1] / 2, img.shape[0] / 2 + 50, int(data[lbll2idx[c], 0]),
                      ha='center', bbox=None)
        txt.set_path_effects([patheffects.withStroke(linewidth=5, foreground='w')])
        ax.imshow(img)
def get_info(label):
    """Return display strings (maker, release year, dimensions, weight,
    display size) for the phone model named `label`.

    Uses the first matching row of the module-level `labels`/`data`/`brands`
    arrays. FIX: the original re-ran np.where(labels == label) seven times
    and called int() on a 1-element array (a NumPy deprecation); the lookup
    is now done once and a single row is indexed.
    """
    idx = np.where(labels == label)[0][0]
    row = data[idx]
    brand = "Maker: {}".format(brands[idx])
    year = "Released year: {}".format(int(row[0]))
    # Spec columns: 1=width, 2=height, 3=depth, 4=weight, 5=display (inches).
    dimension = "Dimension: {:.1f}mm x {:.1f}mm x {:.1f}mm".format(row[2], row[1], row[3])
    weight = "Weight: {:.2f}g".format(row[4])
    displaysize = "Display: {}in".format(row[5])
    return (brand, year, dimension, weight, displaysize)
# Inspect the edges connecting community 1 to community 4: for each edge,
# show both phones side by side (image + spec text).
comm1 = communities[1]
comm2 = communities[4]
edges = [(node1, node2) for node1 in comm1 for node2 in comm2 if (node1, node2) in G.edges]
for n1, n2 in edges:
    d1 = data[np.where(labels == n1)[0]][0]
    d2 = data[np.where(labels == n2)[0]][0]
    img1 = mpimg.imread(os.path.join("images", label2img[n1]))
    img2 = mpimg.imread(os.path.join("images", label2img[n2]))
    fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(15, 3),
                                             gridspec_kw={'width_ratios': [1, 1.2, 1, 1.2]})
    for ax in (ax1, ax2, ax3, ax4):
        ax.axis('off')
    # Panels alternate image / text for each endpoint of the edge.
    for ax_img, ax_txt, node in ((ax1, ax2, n1), (ax3, ax4, n2)):
        ax_img.imshow(img1 if node is n1 else img2)
        info = get_info(node)
        ax_txt.text(0, 0.9, node, bbox=None)
        for offset, line in enumerate(info):
            ax_txt.text(0, 0.8 - 0.1 * offset, line, bbox=None)
import dill
# `comp` is a live generator from girvan_newman and cannot be pickled,
# so drop it before persisting the whole session.
del comp
dill.dump_session('./phone_analysis.pkl')